********************************************************************************
** Cederman, Galano, Girardin and Schvitz. War Did Make States.
** Article prepared for International Organization
** June 20, 2022
**
** Stata do-file: data_prep1.do
** First data preparation file for state-level data
** Required file paths set in runall.do
********************************************************************************

// Move to the input directory; the header notes that the required paths
// are set in runall.do
cd $ROOT
cd $INPUTDIR

// Read the raw state-level file, replacing whatever is currently in memory
insheet using $INPUTFILE, clear

// Discard rows lacking a panel identifier or a time stamp, then declare
// the panel structure and order the rows by state and year
drop if id == . | year == .
xtset id year
sort id year

// State-level war indicator:
//   0 when incidencebrecke is exactly 0,
//   1 when it is positive and not ".",
//   missing in every other case
gen inc1 = cond(incidencebrecke == 0, 0, ///
           cond(incidencebrecke > 0 & incidencebrecke != ., 1, .))

// Share of each state's observations that are war observations
bysort id: egen sumwaryears = total(inc1)
bysort id: egen sumyears = total(1)
gen relwaryears = sumwaryears / sumyears

// Running within-state count of war observations, and its value 5 years back
bysort id (year): gen waryears = sum(inc1)
gen lwaryears = L5.waryears

// State birth and death variables
xtset id year

// Last and first year in which each state is observed
bysort id: egen maxyear = max(year)
bysort id: egen minyear = min(year)

// birth: the state has no observation 5 years earlier
gen birth = (id != . & L5.id == .)
// firstbirth: a birth that falls on the state's earliest observed year
gen firstbirth = (birth == 1 & year == minyear)

// death: the state has no observation 5 years later; only coded before 1790
gen death = (id != . & F5.id == . & year < 1790)
// finaldeath: the state's last observed year, likewise only before 1790
gen finaldeath = (year == maxyear & year < 1790)

// Death history variables (not used)
// deaths = running count of deaths so far; deaths1 = any death so far
bysort id (year): gen deaths = sum(death)
gen deaths1 = (deaths > 0 & deaths != .)

// Duration dependence var
btscs death year id, gen(lifeyears) nspline(3)

// Strip the leading underscore from the three spline variables so the
// _spline* names are free for later btscs calls
forvalues k = 1/3 {
    rename _spline`k' spline`k'
}

// Log of state size (area), its 5-year lag, and the 5-year lag of the war indicator
gen lnarea = ln(area)
gen llarea = L5.lnarea
gen linc1 = L5.inc1

// State age: grows by 5 with every observation of the state;
// llage is the log of the lagged age shifted by 5
bysort id (year): gen age = sum(5)
gen llage = ln(L5.age + 5)

// Prepare cumulative variables at state level

// Missing growth areas are recoded to 0 before any logs or lags are taken
replace growthwarbreckearea = 0 if growthwarbreckearea == .
replace growthpeacebreckearea = 0 if growthpeacebreckearea == .

// Contemporaneous log(1 + x) of war-time and peace-time growth
gen lwargrowth = log(growthwarbreckearea + 1)
gen lpeacegrowth = log(growthpeacebreckearea + 1)

// Same quantities lagged by 5 years ("ll" prefix = lagged log)
gen llwargrowth = log(l5.growthwarbreckearea + 1)
gen llpeacegrowth = log(l5.growthpeacebreckearea + 1)

// Lagged log of total growth (war + peace)
gen llgrowth = log(l5.growthwarbreckearea + l5.growthpeacebreckearea + 1)

// Missing shrinkage areas are recoded to 0 in the same way
replace shrinkwarbreckearea = 0 if shrinkwarbreckearea == .
replace shrinkpeacebreckearea = 0 if shrinkpeacebreckearea == .

// Contemporaneous log(1 + x) of war-time and peace-time shrinkage
gen lwarshrink = log(shrinkwarbreckearea + 1)
gen lpeaceshrink = log(shrinkpeacebreckearea + 1)

// Same quantities lagged by 5 years
gen llwarshrink = log(l5.shrinkwarbreckearea + 1)
gen llpeaceshrink = log(l5.shrinkpeacebreckearea + 1)

// Lagged log of total shrinkage (war + peace).
// Both components must carry the l5. operator, mirroring llgrowth above;
// previously the peace component was taken from the current period.
gen llshrink = log(l5.shrinkwarbreckearea + l5.shrinkpeacebreckearea + 1)

// Net territorial change (growth minus shrinkage), separately for war and peace
gen netwargrowth = growthwarbreckearea - shrinkwarbreckearea
gen netpeacegrowth = growthpeacebreckearea - shrinkpeacebreckearea


// Variations on gain/loss dep. variables (not all used)

// Missing gains and losses are counted as zero
foreach v of varlist gainarea lossarea {
    replace `v' = 0 if `v' == .
}

gen netgain = gainarea - lossarea

// Split the net change into a gain part and a loss part, each floored at zero
gen gain = cond(netgain < 0 & netgain != ., 0, netgain)
gen loss = cond(netgain < 0 & netgain != ., abs(netgain), 0)

// loss_nd: identical to loss, except set to missing where death == 1
gen loss_nd = cond(death == 1, ., loss)

// log(1 + x) transforms
gen lngain = ln(gain + 1)
gen lnloss = ln(loss + 1)
gen lnloss_nd = ln(loss_nd + 1)

// Loss and gain expressed relative to the state's area 5 years earlier
// (missing wherever the lagged area is unavailable)
gen rloss = loss / L5.area
gen rgain = gain / L5.area

// log(1 + x) of the relative measures
gen lnrloss = ln(rloss + 1)
gen lnrgain = ln(rgain + 1)

// Relative loss built from loss_nd, and its log(1 + x)
gen rloss_nd = loss_nd / L5.area
gen lnrloss_nd = ln(rloss_nd + 1)

// Binary indicators: any net gain / any net loss in the period
gen gain1 = 0
replace gain1 = 1 if netgain > 0 & netgain!=.

gen loss1 = 0
replace loss1 = 1 if loss > 0 & loss!=.

// Binary version of loss_nd. loss_nd is set to missing where death==1, and
// Stata ranks missing above every number, so "loss_nd>0" alone is true for
// those rows and would code them as 1. The explicit non-missing guard keeps
// loss_nd1 missing wherever loss_nd is missing.
gen loss_nd1 = loss_nd
replace loss_nd1 = 1 if loss_nd>0 & loss_nd!=.




// Duration variables for state-level models
// Each btscs call writes _spline1-_spline3; they are renamed right away so
// the next call can reuse those names.

// Gains
btscs gain1 year id, gen(nogainyear) nspline(3)
forvalues k = 1/3 {
    rename _spline`k' ngspline`k'
}

// Losses
btscs loss1 year id, gen(nolossyear) nspline(3)
forvalues k = 1/3 {
    rename _spline`k' nlspline`k'
}

// Either kind of event: 1 if a loss occurred, otherwise the gain indicator
gen event = cond(loss1 == 1, 1, gain1)

btscs event year id, gen(noeventyear) nspline(3)
forvalues k = 1/3 {
    rename _spline`k' evspline`k'
}


// Urbanization variables and city state dummies (the latter not used)

// Urban share of the population and its 5-year lag
gen urban = populationurban/population
gen lurban = l5.urban

// citystate is only defined for states whose lagged log area is below 8:
//   0 = urban share at most 0.4, 1 = urban share above 0.4.
// Stata ranks missing above every number, so "urban>0.4" alone is true when
// urban is missing; the explicit guard leaves citystate missing in that case.
gen citystate = .
replace citystate = 0 if urban<=0.4 & llarea<8
replace citystate = 1 if urban>0.4 & urban!=. & llarea<8


// European centrality measures for state location

// Log of the 5-year-lagged distance to the system center (plus 1)
gen llcentraldist = log(l5.systemcenterdistancecentroid+1)

// Within-state median of the lagged distance
by id (year), sort: egen lmedcentraldist = median(l5.systemcenterdistancecentroid)

// lcentral = 1 when the lagged distance is at or below the state's own median.
// If a state has no non-missing lagged distance, both sides of the comparison
// are missing and "<=" is true, so the lagged value is required to be
// non-missing; those rows stay 0 like every other row without a lag.
gen lcentral = 0
replace lcentral = 1 if l5.systemcenterdistancecentroid <= lmedcentraldist & !missing(l5.systemcenterdistancecentroid)


// Partitioning of sample into states above and below median size

// Median log area across all states observed in the same year
bys year: egen medarea = median(lnarea)

// abovemed: 1 = at or above that year's median, 0 = below, missing if lnarea
// is missing. Stata ranks missing above every number, so ">=" alone is true
// for missing lnarea and would code those rows as above-median; the explicit
// guard prevents that.
gen abovemed = .
replace abovemed = 1 if lnarea >= medarea & lnarea != .
replace abovemed = 0 if lnarea < medarea


// Save intermediate result for further processing in data_prep_dyadic and data_prep2

// Switch to the intermediate-output directory (via $ROOT first, as at the
// top of this file), restore state-year order, and overwrite any existing
// copy of the intermediate dataset
cd $ROOT
cd $INTERMEDIATEDIR
sort id year
save "statedata_intermediate.dta", replace 
